package com.spider.downloader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.spider.date.DBCenter;
import com.spider.date.WebName;



public class Scheduler {

	/** Base used to absolutize relative links; change to target a different site. */
	private static final String BASE_URL = "http://bj.ganji.com";

	/** Maximum number of stored URLs fetched per batch. */
	private static final int MAX_PAGES = 10;

	public Scheduler() {
	}

	/**
	 * Parses raw HTML, extracts every anchor's {@code href}, absolutizes
	 * relative links against {@link #BASE_URL}, stores the resulting list in the
	 * shared URL map under the Ganji key, then triggers a download batch.
	 *
	 * @param html raw HTML markup of a crawled page
	 * @throws IOException declared for caller compatibility
	 */
	public void cleanHtml(String html) throws IOException {
		Document doc = Jsoup.parse(html);
		Elements links = doc.select("a"); // every <a>...</a> tag
		List<String> listUrl = new ArrayList<String>();
		for (Element link : links) {
			String url = link.attr("href");
			// Skip empty hrefs and bare one-character fragments such as "#".
			if (url == null || url.isEmpty() || url.length() == 1) {
				continue;
			}
			// Absolutize relative links. startsWith (not contains) so that a
			// relative link embedding "http:" in a query string is still prefixed,
			// and https links are recognized as absolute (original missed both).
			if (!url.startsWith("http:") && !url.startsWith("https:")) {
				url = BASE_URL + url;
			}
			listUrl.add(url);
		}
		Map<String, List<String>> map = DBCenter.getUrl_map();
		// The map currently stores only Ganji links; to extend to other
		// sites, store additional lists under new keys.
		map.put(WebName.GANJI, listUrl);
		createHtmlPool();
	}

	/**
	 * Downloads up to {@link #MAX_PAGES} of the stored Ganji URLs.
	 *
	 * Fix: the original always iterated exactly 10 times, throwing
	 * IndexOutOfBoundsException when fewer than 10 links were collected
	 * (and NPE when the map had no entry for the key).
	 */
	public void createHtmlPool() {
		Map<String, List<String>> map = DBCenter.getUrl_map();
		List<String> listUrl = map.get(WebName.GANJI);
		if (listUrl == null || listUrl.isEmpty()) {
			return; // nothing collected yet
		}
		Downloader download = new Downloader();
		int limit = Math.min(MAX_PAGES, listUrl.size());
		for (int i = 0; i < limit; i++) {
			System.out.println(listUrl.get(i));
			download.getCatchPage(listUrl.get(i));
		}
	}
}
